import pandas as pd
# URLs for OICA vehicles data (production and sales by country & type)
# Hosted CSVs via GitHub (jhelvy/oica dataset)
URL_PRODUCTION = "https://raw.githubusercontent.com/jhelvy/oica/main/data-raw/production.csv"
URL_SALES = "https://raw.githubusercontent.com/jhelvy/oica/main/data-raw/sales_country.csv"
# Load the datasets
production_df = pd.read_csv(URL_PRODUCTION)
sales_df = pd.read_csv(URL_SALES)
# Filter for commercial vehicles only (type == 'cv')
prod_cv = production_df[production_df['type'] == 'cv'].copy()
sales_cv = sales_df[sales_df['type'] == 'cv'].copy()
# Define our target countries
target_countries = [
"India", "Japan", "Thailand",
"South Africa", "United Arab Emirates", "Brazil", "Mexico", "Argentina"
]
# Combine Latin America region countries: Brazil, Mexico, Argentina
# (You can adjust this set as needed.)
latam_countries = ["Brazil", "Mexico", "Argentina"]
# Filter production & sales
prod_cv = prod_cv[prod_cv['country'].isin(target_countries)]
sales_cv = sales_cv[sales_cv['country'].isin(target_countries)]
# Summarize annual totals
prod_summary = prod_cv.pivot_table(
index="year", columns="country", values="n", aggfunc="sum"
).reset_index()
sales_summary = sales_cv.pivot_table(
index="year", columns="country", values="n", aggfunc="sum"
).reset_index()
# Show result
#print("=== Annual Commercial Vehicle Production ===")
#print(prod_summary)
print("\n=== Annual Commercial Vehicle Sales ===")
print(sales_summary)
# Save to CSV
prod_summary.to_csv("annual_commercial_vehicle_production.csv", index=False)
sales_summary.to_csv("annual_commercial_vehicle_sales.csv", index=False)
print("\nSaved CSV files: 'annual_commercial_vehicle_sales.csv'")
=== Annual Commercial Vehicle Sales === country year Argentina Brazil India Japan Mexico \ 0 2005 112042.0 274822.0 333592.0 1103552.0 454498.0 1 2006 124182.0 294791.0 439519.0 1127202.0 503089.0 2 2007 142696.0 377010.0 481909.0 983692.0 509425.0 3 2008 159231.0 479050.0 437657.0 897967.0 484719.0 4 2009 113911.0 497378.0 449391.0 704023.0 336631.0 5 2010 175813.0 658524.0 653193.0 752967.0 344606.0 6 2011 209497.0 731601.0 777424.0 701188.0 344679.0 7 2012 233661.0 686848.0 813589.0 797388.0 375241.0 8 2013 279538.0 726587.0 687323.0 813231.0 402325.0 9 2014 181152.0 703325.0 606269.0 863297.0 431055.0 10 2015 163069.0 445967.0 652566.0 830621.0 497280.0 11 2016 183725.0 373599.0 702640.0 823801.0 581811.0 12 2017 198782.0 316288.0 830346.0 847788.0 546236.0 13 2018 192609.0 464310.0 1005422.0 880907.0 538415.0 14 2019 126375.0 525781.0 854839.0 894125.0 597951.0 15 2020 102183.0 442495.0 505102.0 788634.0 445217.0 16 2021 128664.0 561384.0 677119.0 772642.0 526593.0 17 2022 134740.0 527799.0 933116.0 753023.0 647481.0 18 2023 164445.0 587289.0 978385.0 786359.0 815830.0 country South Africa Thailand United Arab Emirates 0 197538.0 514215.0 20000.0 1 232757.0 490409.0 25000.0 2 241455.0 458860.0 30000.0 3 204125.0 388465.0 42048.0 4 137093.0 318833.0 114850.0 5 155777.0 453713.0 31344.0 6 175949.0 433640.0 50714.0 7 183919.0 763366.0 57400.0 8 200184.0 666926.0 56400.0 9 205240.0 470430.0 56400.0 10 205079.0 443569.0 42700.0 11 186117.0 440735.0 32100.0 12 186117.0 340191.0 12800.0 13 186984.0 560093.0 7646.0 14 181233.0 538914.0 28000.0 15 126092.0 448652.0 28810.0 16 160153.0 436380.0 32064.0 17 150788.0 506039.0 36125.0 18 184399.0 369279.0 33753.0 Saved CSV files: 'annual_commercial_vehicle_sales.csv'
import pandas as pd
import numpy as np
# Monthly seasonal weights (must sum to 1)
seasonal_weights = {
1: 0.06,
2: 0.06,
3: 0.10,
4: 0.08,
5: 0.08,
6: 0.09,
7: 0.07,
8: 0.07,
9: 0.09,
10: 0.12,
11: 0.10,
12: 0.08
}
print("Total weight:", sum(seasonal_weights.values()))
Total weight: 1.0
# Load annual data
annual_df = pd.read_csv("annual_commercial_vehicle_sales.csv")
# Melt to long format
annual_long = annual_df.melt(id_vars="year",
var_name="country",
value_name="annual_total")
monthly_data = []
for _, row in annual_long.iterrows():
for month in range(1, 13):
monthly_value = row["annual_total"] * seasonal_weights[month]
monthly_data.append({
"country": row["country"],
"year": row["year"],
"month": month,
"volume": monthly_value
})
monthly_df = pd.DataFrame(monthly_data)
# removing fractions
monthly_df = monthly_df.round(0)
numeric_cols = monthly_df.select_dtypes(include=['float64', 'int64']).columns
monthly_df[numeric_cols] = monthly_df[numeric_cols].astype(int)
# Create datetime column
monthly_df["ds"] = pd.to_datetime(
monthly_df[["year", "month"]].assign(day=1)
)
monthly_df.to_csv("monthly_commercial_vehicle_data.csv", index=False)
print("Monthly dataset created!")
print(monthly_df.head())
Monthly dataset created!
country year month volume ds
0 Argentina 2005 1 6723 2005-01-01
1 Argentina 2005 2 6723 2005-02-01
2 Argentina 2005 3 11204 2005-03-01
3 Argentina 2005 4 8963 2005-04-01
4 Argentina 2005 5 8963 2005-05-01
import pandas as pd
# Load monthly data
df = pd.read_csv("monthly_commercial_vehicle_data.csv")
# Define country-wise ratios
ratios = {
"India": {"LD": 0.55, "MD": 0.25, "HD": 0.20},
"Japan": {"LD": 0.65, "MD": 0.20, "HD": 0.15},
"Thailand": {"LD": 0.60, "MD": 0.25, "HD": 0.15},
"South Africa": {"LD": 0.50, "MD": 0.30, "HD": 0.20},
"United Arab Emirates": {"LD": 0.45, "MD": 0.30, "HD": 0.25},
"Brazil": {"LD": 0.58, "MD": 0.27, "HD": 0.15},
"Mexico": {"LD": 0.58, "MD": 0.27, "HD": 0.15},
"Argentina": {"LD": 0.58, "MD": 0.27, "HD": 0.15}
}
# Apply split
df["LD"] = df.apply(lambda x: x["volume"] * ratios[x["country"]]["LD"], axis=1)
df["MD"] = df.apply(lambda x: x["volume"] * ratios[x["country"]]["MD"], axis=1)
df["HD"] = df.apply(lambda x: x["volume"] * ratios[x["country"]]["HD"], axis=1)
# Round all numeric columns and convert to int
df = df.round(0)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].astype(int)
# Save
df.to_csv("monthly_LD_MD_HD_split.csv", index=False)
print("LD / MD / HD split completed!")
print(df.head())
LD / MD / HD split completed!
country year month volume ds LD MD HD
0 Argentina 2005 1 6723 2005-01-01 3899 1815 1008
1 Argentina 2005 2 6723 2005-02-01 3899 1815 1008
2 Argentina 2005 3 11204 2005-03-01 6498 3025 1681
3 Argentina 2005 4 8963 2005-04-01 5199 2420 1344
4 Argentina 2005 5 8963 2005-05-01 5199 2420 1344
import pandas as pd
import matplotlib.pyplot as plt
# Convert to yearly totals
yearly_total = (
df.groupby("year")[["LD", "MD", "HD"]]
.sum()
.reset_index()
.sort_values("year")
)
# Plot (Strict X=Year, Y=Volume)
plt.figure()
plt.plot(yearly_total["year"], yearly_total["LD"])
plt.plot(yearly_total["year"], yearly_total["MD"])
plt.plot(yearly_total["year"], yearly_total["HD"])
plt.xlabel("year")
plt.ylabel("Volume")
plt.title("Year-wise LD, MD, HD Volume Trend")
plt.xticks(yearly_total["year"])
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
# Ensure year column is numeric
df["year"] = df["year"].astype(int)
countries = df["country"].unique()
years = list(range(2005, 2024)) # 2005 to 2023
for c in countries:
country_data = df[df["country"] == c]
# Aggregate yearly volumes
yearly_data = (
country_data.groupby("year")[["LD", "MD", "HD"]]
.sum()
.reset_index()
.sort_values("year")
)
# Total Volume
yearly_data["Total"] = yearly_data["LD"] + yearly_data["MD"] + yearly_data["HD"]
# Plot
plt.figure(figsize=(12,6)) # bigger figure
# Continuous lines (no markers)
plt.plot(yearly_data["year"], yearly_data["Total"], label="Total", linewidth=2)
plt.plot(yearly_data["year"], yearly_data["LD"], label="LD", linewidth=2)
plt.plot(yearly_data["year"], yearly_data["MD"], label="MD", linewidth=2)
plt.plot(yearly_data["year"], yearly_data["HD"], label="HD", linewidth=2)
# Labels, title, legend
plt.xlabel("Year", fontsize=14)
plt.ylabel("Volume", fontsize=14)
plt.title(f"{c} - Yearly LD, MD, HD & Total Volume Trend", fontsize=16)
# Strict X-axis: show only years 2005–2023
plt.xticks(years, rotation=45, fontsize=12)
# Increase font size for Y-axis and legend
plt.yticks(fontsize=12)
plt.legend(fontsize=12)
# Grid for better readability
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
import requests
import pandas as pd
# -----------------------------------
# Config
# -----------------------------------
countries = {
"India": "IND",
"Japan": "JPN",
"Thailand": "THA",
"South Africa": "ZAF",
"United Arab Emirates": "ARE",
"Brazil": "BRA",
"Mexico": "MEX",
"Argentina": "ARG"
}
start_year = 2005
end_year = 2023
# -----------------------------------
# Fetch Annual GDP Data from World Bank
# -----------------------------------
def fetch_gdp_world_bank(country_code):
"""
Fetch annual GDP (current USD) for a country code
from World Bank API for years 2005–2023.
"""
url = (
f"https://api.worldbank.org/v2/country/{country_code}"
f"/indicator/NY.GDP.MKTP.CD?date={start_year}:{end_year}&format=json"
)
response = requests.get(url)
if response.status_code != 200:
print(f"Failed to fetch for {country_code}")
return {}
data = response.json()
gdp_dict = {}
# Data is usually in data[1]
for entry in data[1]:
year = int(entry["date"])
gdp_value = entry["value"]
if gdp_value is not None:
gdp_dict[year] = float(gdp_value)
return gdp_dict
# -----------------------------------
# Build Full Monthly Dataset
# -----------------------------------
all_rows = []
for country_name, iso_code in countries.items():
yearly_data = fetch_gdp_world_bank(iso_code)
for year in range(start_year, end_year + 1):
# If missing, fill with 0 or NaN
annual_gdp = yearly_data.get(year, None)
if annual_gdp is None:
continue # skip missing data
monthly_value = annual_gdp / 12
for month in range(1, 13):
all_rows.append({
"Country": country_name,
"Year": year,
"Month": month,
"Annual_GDP_USD": annual_gdp,
"Monthly_GDP_USD": round(monthly_value, 2)
})
# Create DataFrame
gdp_df = pd.DataFrame(all_rows)
# Save CSV
gdp_df.to_csv("gdp_worldbank_monthly_2005_2023.csv", index=False)
print("Saved dataset with monthly GDP estimates.")
print(gdp_df.head())
Saved dataset with monthly GDP estimates. Country Year Month Annual_GDP_USD Monthly_GDP_USD 0 India 2005 1 8.203838e+11 6.836531e+10 1 India 2005 2 8.203838e+11 6.836531e+10 2 India 2005 3 8.203838e+11 6.836531e+10 3 India 2005 4 8.203838e+11 6.836531e+10 4 India 2005 5 8.203838e+11 6.836531e+10
import pandas as pd
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("gdp_worldbank_monthly_with_growth_2005_2023.csv")
# Extract yearly GDP (remove monthly duplicates)
yearly_gdp = (
df.groupby(["Country", "Year"])["Annual_GDP_USD"]
.first()
.reset_index()
)
# Filter years
yearly_gdp = yearly_gdp[
(yearly_gdp["Year"] >= 2005) &
(yearly_gdp["Year"] <= 2023)
]
# Plot
plt.figure()
for country in yearly_gdp["Country"].unique():
subset = yearly_gdp[yearly_gdp["Country"] == country]
plt.plot(subset["Year"], subset["Annual_GDP_USD"], label=country)
plt.xlabel("Year")
plt.ylabel("Annual GDP (USD)")
plt.title("Yearly GDP Trend (2005–2023)")
plt.xticks(range(2005, 2024, 2)) # show year labels clearly
plt.legend()
plt.show()
# Load truck data
cv_df = pd.read_csv("monthly_LD_MD_HD_split.csv")
cv_df
| country | year | month | volume | ds | LD | MD | HD | |
|---|---|---|---|---|---|---|---|---|
| 0 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 |
| 1 | Argentina | 2005 | 2 | 6723 | 2005-02-01 | 3899 | 1815 | 1008 |
| 2 | Argentina | 2005 | 3 | 11204 | 2005-03-01 | 6498 | 3025 | 1681 |
| 3 | Argentina | 2005 | 4 | 8963 | 2005-04-01 | 5199 | 2420 | 1344 |
| 4 | Argentina | 2005 | 5 | 8963 | 2005-05-01 | 5199 | 2420 | 1344 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1819 | United Arab Emirates | 2023 | 8 | 2363 | 2023-08-01 | 1063 | 709 | 591 |
| 1820 | United Arab Emirates | 2023 | 9 | 3038 | 2023-09-01 | 1367 | 911 | 760 |
| 1821 | United Arab Emirates | 2023 | 10 | 4050 | 2023-10-01 | 1822 | 1215 | 1012 |
| 1822 | United Arab Emirates | 2023 | 11 | 3375 | 2023-11-01 | 1519 | 1012 | 844 |
| 1823 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 |
1824 rows × 8 columns
# Load macro datasets
gdp = pd.read_csv("gdp_worldbank_monthly_with_growth_2005_2023.csv")
# Suppose your GDP DataFrame is called df_gdp
gdp.rename(columns={"Country": "country"}, inplace=True)
gdp.rename(columns={"Year": "year"}, inplace=True)
# Verify
gdp
| country | year | Month | Annual_GDP_USD | Monthly_GDP_USD | GDP_Growth_% | |
|---|---|---|---|---|---|---|
| 0 | India | 2005 | 1 | 8.203838e+11 | 6.836531e+10 | 0.00 |
| 1 | India | 2005 | 2 | 8.203838e+11 | 6.836531e+10 | 0.00 |
| 2 | India | 2005 | 3 | 8.203838e+11 | 6.836531e+10 | 0.00 |
| 3 | India | 2005 | 4 | 8.203838e+11 | 6.836531e+10 | 0.00 |
| 4 | India | 2005 | 5 | 8.203838e+11 | 6.836531e+10 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... |
| 1819 | Argentina | 2023 | 8 | 6.494617e+11 | 5.412181e+10 | 2.44 |
| 1820 | Argentina | 2023 | 9 | 6.494617e+11 | 5.412181e+10 | 2.44 |
| 1821 | Argentina | 2023 | 10 | 6.494617e+11 | 5.412181e+10 | 2.44 |
| 1822 | Argentina | 2023 | 11 | 6.494617e+11 | 5.412181e+10 | 2.44 |
| 1823 | Argentina | 2023 | 12 | 6.494617e+11 | 5.412181e+10 | 2.44 |
1824 rows × 6 columns
merged_df = pd.merge(cv_df, gdp, on=["country", "year"], how="inner")
print(merged_df)
country year month volume ds LD MD \
0 Argentina 2005 1 6723 2005-01-01 3899 1815
1 Argentina 2005 1 6723 2005-01-01 3899 1815
2 Argentina 2005 1 6723 2005-01-01 3899 1815
3 Argentina 2005 1 6723 2005-01-01 3899 1815
4 Argentina 2005 1 6723 2005-01-01 3899 1815
... ... ... ... ... ... ... ...
21883 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
21884 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
21885 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
21886 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
21887 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
HD Month Annual_GDP_USD Monthly_GDP_USD GDP_Growth_%
0 1008 1 1.987371e+11 1.656142e+10 0.00
1 1008 2 1.987371e+11 1.656142e+10 0.00
2 1008 3 1.987371e+11 1.656142e+10 0.00
3 1008 4 1.987371e+11 1.656142e+10 0.00
4 1008 5 1.987371e+11 1.656142e+10 0.00
... ... ... ... ... ...
21883 675 8 5.226222e+11 4.355185e+10 2.19
21884 675 9 5.226222e+11 4.355185e+10 2.19
21885 675 10 5.226222e+11 4.355185e+10 2.19
21886 675 11 5.226222e+11 4.355185e+10 2.19
21887 675 12 5.226222e+11 4.355185e+10 2.19
[21888 rows x 12 columns]
merged_df.to_csv("monthly_cv_with_macro.csv", index=False)
print("Final macro-enhanced dataset ready!")
Final macro-enhanced dataset ready!
import pandas as pd
df = pd.read_csv("monthly_cv_with_macro.csv")
print(df.columns)
Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%'],
dtype='object')
pip install requests pandas
Requirement already satisfied: requests in c:\users\hp\documents\anaconda3_new\lib\site-packages (2.32.3) Requirement already satisfied: pandas in c:\users\hp\documents\anaconda3_new\lib\site-packages (2.3.0) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2.6.3) Requirement already satisfied: certifi>=2017.4.17 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2026.1.4) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (3.3) Requirement already satisfied: tzdata>=2022.7 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2025.2) Requirement already satisfied: numpy>=1.22.4 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (1.26.4) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2025.2) Requirement already satisfied: six>=1.5 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Note: you may need to restart the kernel to use updated packages.
pip install requests pandas beautifulsoup4 lxml
Requirement already satisfied: requests in c:\users\hp\documents\anaconda3_new\lib\site-packages (2.32.3)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: pandas in c:\users\hp\documents\anaconda3_new\lib\site-packages (2.3.0) Requirement already satisfied: beautifulsoup4 in c:\users\hp\documents\anaconda3_new\lib\site-packages (4.11.1) Requirement already satisfied: lxml in c:\users\hp\documents\anaconda3_new\lib\site-packages (4.9.1) Requirement already satisfied: certifi>=2017.4.17 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2026.1.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2.6.3) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from requests) (3.3) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2025.2) Requirement already satisfied: numpy>=1.22.4 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (1.26.4) Requirement already satisfied: tzdata>=2022.7 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from pandas) (2025.2) Requirement already satisfied: soupsieve>1.2 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from beautifulsoup4) (2.3.1) Requirement already satisfied: six>=1.5 in c:\users\hp\documents\anaconda3_new\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
import pandas as pd
import numpy as np
# ----------------------------------
# CONFIGURATION
# ----------------------------------
START_YEAR = 2005
END_YEAR = 2023
dates = pd.date_range(
start=f"{START_YEAR}-01-01",
end=f"{END_YEAR}-12-01",
freq="MS"
)
# ----------------------------------
# Country Tax Structures (%)
# ----------------------------------
countries = {
"India": 0.55,
"Japan": 0.40,
"Thailand": 0.35,
"South Africa": 0.30,
"United Arab Emirates": 0.10,
"Brazil": 0.45,
"Mexico": 0.38,
"Argentina": 0.42
}
# ----------------------------------
# Function: Realistic Crude Oil Price
# ----------------------------------
def crude_price(year):
if 2005 <= year <= 2007:
return 60
elif 2008:
return 100
elif 2009 <= year <= 2013:
return 90
elif 2014 <= year <= 2016:
return 50
elif 2017 <= year <= 2019:
return 65
elif year == 2020:
return 40
elif year == 2021:
return 70
elif year >= 2022:
return 95
# ----------------------------------
# Generate Data
# ----------------------------------
data = []
for country, tax_rate in countries.items():
for date in dates:
year = date.year
# Base crude oil price
crude = crude_price(year)
# Add monthly volatility
crude += np.random.normal(0, 3)
# Convert barrel to liter
crude_per_liter = crude / 159
# Refining + transport cost
refining_cost = 0.15
# Pre-tax base price
base_price = crude_per_liter + refining_cost
# Add country tax
petrol_price = base_price * (1 + tax_rate)
# Diesel usually slightly lower tax
diesel_price = base_price * (1 + (tax_rate * 0.9))
data.append([
country,
date.year,
date.month,
date.strftime("%Y-%m"),
round(petrol_price, 3),
round(diesel_price, 3)
])
# ----------------------------------
# Create DataFrame
# ----------------------------------
df = pd.DataFrame(data, columns=[
"Country",
"Year",
"Month",
"Year-Month",
"Petrol_USD_per_Liter",
"Diesel_USD_per_Liter"
])
df.to_csv("fuel_petrol_diesel_2005_2023_usd.csv", index=False)
print(df.head())
print("Total rows:", len(df))
Country Year Month Year-Month Petrol_USD_per_Liter Diesel_USD_per_Liter 0 India 2005 1 2005-01 0.828 0.799 1 India 2005 2 2005-02 0.799 0.770 2 India 2005 3 2005-03 0.828 0.799 3 India 2005 4 2005-04 0.862 0.832 4 India 2005 5 2005-05 0.816 0.787 Total rows: 1824
df
| Country | Year | Month | Year-Month | Petrol_USD_per_Liter | Diesel_USD_per_Liter | |
|---|---|---|---|---|---|---|
| 0 | India | 2005 | 1 | 2005-01 | 0.828 | 0.799 |
| 1 | India | 2005 | 2 | 2005-02 | 0.799 | 0.770 |
| 2 | India | 2005 | 3 | 2005-03 | 0.828 | 0.799 |
| 3 | India | 2005 | 4 | 2005-04 | 0.862 | 0.832 |
| 4 | India | 2005 | 5 | 2005-05 | 0.816 | 0.787 |
| ... | ... | ... | ... | ... | ... | ... |
| 1819 | Argentina | 2023 | 8 | 2023-08 | 1.117 | 1.084 |
| 1820 | Argentina | 2023 | 9 | 2023-09 | 1.093 | 1.061 |
| 1821 | Argentina | 2023 | 10 | 2023-10 | 1.106 | 1.073 |
| 1822 | Argentina | 2023 | 11 | 2023-11 | 1.121 | 1.088 |
| 1823 | Argentina | 2023 | 12 | 2023-12 | 1.106 | 1.074 |
1824 rows × 6 columns
df
| Country | Year | Month | Year-Month | Petrol_USD_per_Liter | Diesel_USD_per_Liter | |
|---|---|---|---|---|---|---|
| 0 | India | 2005 | 1 | 2005-01 | 0.828 | 0.799 |
| 1 | India | 2005 | 2 | 2005-02 | 0.799 | 0.770 |
| 2 | India | 2005 | 3 | 2005-03 | 0.828 | 0.799 |
| 3 | India | 2005 | 4 | 2005-04 | 0.862 | 0.832 |
| 4 | India | 2005 | 5 | 2005-05 | 0.816 | 0.787 |
| ... | ... | ... | ... | ... | ... | ... |
| 1819 | Argentina | 2023 | 8 | 2023-08 | 1.117 | 1.084 |
| 1820 | Argentina | 2023 | 9 | 2023-09 | 1.093 | 1.061 |
| 1821 | Argentina | 2023 | 10 | 2023-10 | 1.106 | 1.073 |
| 1822 | Argentina | 2023 | 11 | 2023-11 | 1.121 | 1.088 |
| 1823 | Argentina | 2023 | 12 | 2023-12 | 1.106 | 1.074 |
1824 rows × 6 columns
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
# -----------------------------
# ASSUME DATAFRAME IS df
# -----------------------------
# Columns: 'country', 'year', 'Diesel_USD_per_Liter'
# Ensure column names are lowercase
df = df.rename(columns={"Country": "country", "Year": "year"})
# Clip diesel values to max 1.5 USD for logical scale
df["Diesel_USD_per_Liter"] = df["Diesel_USD_per_Liter"].clip(upper=1.5)
# Get list of countries
countries = df["country"].unique()
# ----------------------------
# Load macro datasets
fuel_price = pd.read_csv("fuel_petrol_diesel_2005_2023_usd.csv")
# Suppose your GDP DataFrame is called df_gdp
fuel_price.rename(columns={"Country": "country"}, inplace=True)
fuel_price.rename(columns={"Year": "year"}, inplace=True)
# Verify
fuel_price
| country | year | Month | Year-Month | Petrol_USD_per_Liter | Diesel_USD_per_Liter | |
|---|---|---|---|---|---|---|
| 0 | India | 2005 | 1 | 2005-01 | 0.828 | 0.799 |
| 1 | India | 2005 | 2 | 2005-02 | 0.799 | 0.770 |
| 2 | India | 2005 | 3 | 2005-03 | 0.828 | 0.799 |
| 3 | India | 2005 | 4 | 2005-04 | 0.862 | 0.832 |
| 4 | India | 2005 | 5 | 2005-05 | 0.816 | 0.787 |
| ... | ... | ... | ... | ... | ... | ... |
| 1819 | Argentina | 2023 | 8 | 2023-08 | 1.117 | 1.084 |
| 1820 | Argentina | 2023 | 9 | 2023-09 | 1.093 | 1.061 |
| 1821 | Argentina | 2023 | 10 | 2023-10 | 1.106 | 1.073 |
| 1822 | Argentina | 2023 | 11 | 2023-11 | 1.121 | 1.088 |
| 1823 | Argentina | 2023 | 12 | 2023-12 | 1.106 | 1.074 |
1824 rows × 6 columns
merged_df1 = pd.merge(merged_df, fuel_price, on=["country", "year"], how="inner")
print(merged_df1)
country year month volume ds LD MD \
0 Argentina 2005 1 6723 2005-01-01 3899 1815
1 Argentina 2005 1 6723 2005-01-01 3899 1815
2 Argentina 2005 1 6723 2005-01-01 3899 1815
3 Argentina 2005 1 6723 2005-01-01 3899 1815
4 Argentina 2005 1 6723 2005-01-01 3899 1815
... ... ... ... ... ... ... ...
262651 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
262652 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
262653 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
262654 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
262655 United Arab Emirates 2023 12 2700 2023-12-01 1215 810
HD Month_x Annual_GDP_USD Monthly_GDP_USD GDP_Growth_% Month_y \
0 1008 1 1.987371e+11 1.656142e+10 0.00 1
1 1008 1 1.987371e+11 1.656142e+10 0.00 2
2 1008 1 1.987371e+11 1.656142e+10 0.00 3
3 1008 1 1.987371e+11 1.656142e+10 0.00 4
4 1008 1 1.987371e+11 1.656142e+10 0.00 5
... ... ... ... ... ... ...
262651 675 12 5.226222e+11 4.355185e+10 2.19 8
262652 675 12 5.226222e+11 4.355185e+10 2.19 9
262653 675 12 5.226222e+11 4.355185e+10 2.19 10
262654 675 12 5.226222e+11 4.355185e+10 2.19 11
262655 675 12 5.226222e+11 4.355185e+10 2.19 12
Year-Month Petrol_USD_per_Liter Diesel_USD_per_Liter
0 2005-01 0.784 0.760
1 2005-02 0.748 0.726
2 2005-03 0.722 0.701
3 2005-04 0.735 0.714
4 2005-05 0.771 0.749
... ... ... ...
262651 2023-08 0.868 0.860
262652 2023-09 0.858 0.850
262653 2023-10 0.858 0.850
262654 2023-11 0.842 0.835
262655 2023-12 0.843 0.835
[262656 rows x 16 columns]
merged_df1.to_csv("monthly_cv_with_macro.csv", index=False)
print("Final macro-enhanced dataset ready!")
Final macro-enhanced dataset ready!
import pandas as pd
df = pd.read_csv("monthly_cv_with_macro.csv")
print(df.columns)
Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Year-Month', 'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter'],
dtype='object')
import pandas as pd
import matplotlib.pyplot as plt
# -----------------------------
# LOAD DATA
# -----------------------------
vehicle_df = pd.read_csv("monthly_cv_with_macro.csv")
fuel_df = pd.read_csv("fuel_petrol_diesel_2005_2023_usd.csv")
# Ensure Year column is lowercase 'year'
vehicle_df = vehicle_df.rename(columns={"Year": "year"})
fuel_df = fuel_df.rename(columns={"Year": "year"})
# -----------------------------
# AGGREGATE DIESEL PRICE YEARLY
# -----------------------------
diesel_yearly = fuel_df.groupby("year")["Diesel_USD_per_Liter"].mean().reset_index()
# -----------------------------
# AGGREGATE VEHICLE VOLUME BY YEAR
# -----------------------------
volume_yearly = vehicle_df.groupby("year")[["LD","MD","HD"]].sum().reset_index()
# Merge diesel price with vehicle volume
plot_df = pd.merge(volume_yearly, diesel_yearly, on="year")
# -----------------------------
# PLOTTING LINE GRAPH
# -----------------------------
fig, ax1 = plt.subplots(figsize=(12,6))
# Vehicle volume lines
line_ld, = ax1.plot(plot_df["year"], plot_df["LD"], label="LD Volume", marker='o')
line_md, = ax1.plot(plot_df["year"], plot_df["MD"], label="MD Volume", marker='s')
line_hd, = ax1.plot(plot_df["year"], plot_df["HD"], label="HD Volume", marker='^')
ax1.set_xlabel("Year", fontsize=12)
ax1.set_ylabel("Vehicle Volume (Units)", fontsize=12)
ax1.set_xticks(plot_df["year"])
ax1.set_xticklabels(plot_df["year"], rotation=45)
ax1.grid(True)
# Secondary axis for diesel price
ax2 = ax1.twinx()
line_diesel, = ax2.plot(plot_df["year"], plot_df["Diesel_USD_per_Liter"], color='black', linestyle='--', linewidth=2, label='Diesel Price (USD/L)')
ax2.set_ylabel("Diesel Price (USD per Liter)", fontsize=12)
# Combine legends
lines = [line_ld, line_md, line_hd, line_diesel]
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc="upper left", fontsize=10)
plt.title("Impact of Diesel Fuel Cost on LD, MD, HD Vehicle Volumes (Yearly)", fontsize=14)
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# -----------------------------
# LOAD DATA
# -----------------------------
vehicle_df = pd.read_csv("monthly_cv_with_macro.csv")
fuel_df = pd.read_csv("fuel_petrol_diesel_2005_2023_usd.csv")
# Lowercase column names
vehicle_df = vehicle_df.rename(columns={"Year":"year","Country":"country"})
fuel_df = fuel_df.rename(columns={"Year":"year","Country":"country"})
# -----------------------------
# AGGREGATE DIESEL PRICE YEARLY PER COUNTRY
# -----------------------------
diesel_yearly = fuel_df.groupby(["country","year"])["Diesel_USD_per_Liter"].mean().reset_index()
# Clip diesel values to a reasonable max for plotting (e.g., 1.5 USD)
diesel_yearly["Diesel_USD_per_Liter"] = diesel_yearly["Diesel_USD_per_Liter"].clip(upper=1.5)
# -----------------------------
# AGGREGATE VEHICLE VOLUME YEARLY PER COUNTRY
# -----------------------------
volume_yearly = vehicle_df.groupby(["country","year"])[["LD","MD","HD"]].sum().reset_index()
# Merge volumes with diesel price
merged_df = pd.merge(volume_yearly, diesel_yearly, on=["country","year"], how="left")
# -----------------------------
# PLOTTING COUNTRY-WISE TRENDS
# -----------------------------
countries = merged_df["country"].unique()
num_countries = len(countries)
fig, axes = plt.subplots(nrows=(num_countries+1)//2, ncols=2, figsize=(14, 4*((num_countries+1)//2)))
axes = axes.flatten()
for i, country in enumerate(countries):
ax1 = axes[i]
country_data = merged_df[merged_df["country"]==country]
# Plot LD, MD, HD volumes
ax1.plot(country_data["year"], country_data["LD"], label="LD Volume", linewidth=2)
ax1.plot(country_data["year"], country_data["MD"], label="MD Volume", linewidth=2)
ax1.plot(country_data["year"], country_data["HD"], label="HD Volume", linewidth=2)
ax1.set_xlabel("Year")
ax1.set_ylabel("Vehicle Volume (Units)")
ax1.set_xticks(country_data["year"])
ax1.set_xticklabels(country_data["year"], rotation=45)
ax1.grid(True)
ax1.set_title(f"{country} - Vehicle Volume vs Diesel Price")
# Secondary axis for diesel price
ax2 = ax1.twinx()
ax2.plot(country_data["year"], country_data["Diesel_USD_per_Liter"], color='black',
linestyle='--', linewidth=2, label='Diesel Price (USD/L)')
ax2.set_ylabel("Diesel Price (USD per Liter)")
# Fix secondary Y-axis scale for all countries
ax2.set_ylim(0, 1.5) # max diesel scale 1.5 USD, min 0
# Combine legends
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc="upper left", fontsize=8)
# Hide empty subplots
for j in range(i+1, len(axes)):
axes[j].axis('off')
plt.tight_layout()
plt.show()
df.columns = df.columns.str.strip()
df.columns
import pandas as pd
df = pd.read_csv("monthly_cv_with_macro.csv")
print(df.columns)
Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Year-Month', 'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter'],
dtype='object')
import pandas as pd
import numpy as np
# ---------------------------
# Step 1: Define countries and years
# ---------------------------
countries = [
"India", "Japan", "Thailand", "South Africa",
"United Arab Emirates", "Brazil", "Mexico", "Argentina"
]
years = list(range(2005, 2024)) # 2005 to 2023
# ---------------------------
# Step 2: Annual freight volumes (in million tonnes)
# Replace these with actual annual data or fetch from API
# Example: Random synthetic data for demonstration
# ---------------------------
np.random.seed(42) # reproducible
annual_data = {}
for country in countries:
# Generate some synthetic annual data
annual_data[country] = np.random.randint(50, 500, size=len(years))
# Convert to DataFrame
df_annual = pd.DataFrame(annual_data, index=years)
df_annual.index.name = "Year"
print("Annual Freight Data (Million Tonnes):")
print(df_annual.head())
# ---------------------------
# Step 3: Convert annual to monthly
# ---------------------------
months = list(range(1, 13)) # 1=Jan, 12=Dec
records = []
for year in years:
for month in months:
for country in countries:
# Simple equal distribution across months
monthly_value = df_annual.loc[year, country] / 12
records.append({
"Year": year,
"Month": month,
"Country": country,
"Freight_Tonnes": round(monthly_value, 2)
})
df_monthly = pd.DataFrame(records)
# ---------------------------
# Step 4: Save to CSV
# ---------------------------
df_monthly.to_csv("monthly_freight_data.csv", index=False)
print("\nMonthly freight data saved to 'monthly_freight_data.csv'.")
print(df_monthly.head(12)) # show first year
Annual Freight Data (Million Tonnes):
India Japan Thailand South Africa United Arab Emirates Brazil \
Year
2005 152 358 320 365 409 264
2006 485 307 239 63 437 301
2007 398 393 495 291 51 239
2008 320 463 224 314 439 345
2009 156 343 495 395 103 262
Mexico Argentina
Year
2005 350 77
2006 114 184
2007 394 250
2008 376 377
2009 58 317
Monthly freight data saved to 'monthly_freight_data.csv'.
Year Month Country Freight_Tonnes
0 2005 1 India 12.67
1 2005 1 Japan 29.83
2 2005 1 Thailand 26.67
3 2005 1 South Africa 30.42
4 2005 1 United Arab Emirates 34.08
5 2005 1 Brazil 22.00
6 2005 1 Mexico 29.17
7 2005 1 Argentina 6.42
8 2005 2 India 12.67
9 2005 2 Japan 29.83
10 2005 2 Thailand 26.67
11 2005 2 South Africa 30.42
import pandas as pd
print(df_monthly.columns)
Index(['Year', 'Month', 'Country', 'Freight_Tonnes'], dtype='object')
# Load macro datasets
df_monthly.rename(columns={"Country": "country"}, inplace=True)
df_monthly.rename(columns={"Year": "year"}, inplace=True)
# Verify
print(df_monthly.columns)
Index(['year', 'Month', 'country', 'Freight_Tonnes'], dtype='object')
merged_df2 = pd.merge(merged_df1, df_monthly, on=["country", "year"], how="inner")
print(merged_df2.columns)
Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Year-Month', 'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter', 'Month',
'Freight_Tonnes'],
dtype='object')
merged_df2
| country | year | month | volume | ds | LD | MD | HD | Month_x | Annual_GDP_USD | Monthly_GDP_USD | GDP_Growth_% | Month_y | Year-Month | Petrol_USD_per_Liter | Diesel_USD_per_Liter | Month | Freight_Tonnes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 | 1 | 1.987371e+11 | 1.656142e+10 | 0.00 | 1 | 2005-01 | 0.784 | 0.760 | 1 | 6.42 |
| 1 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 | 1 | 1.987371e+11 | 1.656142e+10 | 0.00 | 1 | 2005-01 | 0.784 | 0.760 | 2 | 6.42 |
| 2 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 | 1 | 1.987371e+11 | 1.656142e+10 | 0.00 | 1 | 2005-01 | 0.784 | 0.760 | 3 | 6.42 |
| 3 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 | 1 | 1.987371e+11 | 1.656142e+10 | 0.00 | 1 | 2005-01 | 0.784 | 0.760 | 4 | 6.42 |
| 4 | Argentina | 2005 | 1 | 6723 | 2005-01-01 | 3899 | 1815 | 1008 | 1 | 1.987371e+11 | 1.656142e+10 | 0.00 | 1 | 2005-01 | 0.784 | 0.760 | 5 | 6.42 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3151867 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 | 12 | 5.226222e+11 | 4.355185e+10 | 2.19 | 12 | 2023-12 | 0.843 | 0.835 | 8 | 26.67 |
| 3151868 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 | 12 | 5.226222e+11 | 4.355185e+10 | 2.19 | 12 | 2023-12 | 0.843 | 0.835 | 9 | 26.67 |
| 3151869 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 | 12 | 5.226222e+11 | 4.355185e+10 | 2.19 | 12 | 2023-12 | 0.843 | 0.835 | 10 | 26.67 |
| 3151870 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 | 12 | 5.226222e+11 | 4.355185e+10 | 2.19 | 12 | 2023-12 | 0.843 | 0.835 | 11 | 26.67 |
| 3151871 | United Arab Emirates | 2023 | 12 | 2700 | 2023-12-01 | 1215 | 810 | 675 | 12 | 5.226222e+11 | 4.355185e+10 | 2.19 | 12 | 2023-12 | 0.843 | 0.835 | 12 | 26.67 |
3151872 rows × 18 columns
merged_df2.to_csv("monthly_cv_with_macro.csv", index=False)
print("Final macro-enhanced dataset ready!")
Final macro-enhanced dataset ready!
import pandas as pd
df = pd.read_csv("monthly_cv_with_macro.csv")
print(df.columns)
Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Year-Month', 'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter', 'Month',
'Freight_Tonnes'],
dtype='object')
import pandas as pd
import matplotlib.pyplot as plt
# Load dataset
df = pd.read_csv("monthly_cv_with_macro.csv")
# Clean column names (remove spaces)
df.columns = df.columns.str.strip()
# Create 'ds' column safely
if "ds" in df.columns:
df["ds"] = pd.to_datetime(df["ds"])
elif "Year-Month" in df.columns:
df["ds"] = pd.to_datetime(df["Year-Month"])
elif "year" in df.columns and "month" in df.columns:
df["ds"] = pd.to_datetime(
df["year"].astype(str) + "-" +
df["month"].astype(str) + "-01"
)
else:
raise ValueError("No valid date columns found to create 'ds'.")
# Sort by date
df = df.sort_values("ds")
# Display first few rows
print(df[["ds"]].head())
ds 0 2005-01-01 2759035 2005-01-01 2759036 2005-01-01 2759037 2005-01-01 2759038 2005-01-01
df = df.select_dtypes(include=["int64", "float64"])
print(df.columns)
Index(['year', 'month', 'volume', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter', 'Month',
'Freight_Tonnes'],
dtype='object')
corr_matrix = df.corr()
print(corr_matrix)
year month volume LD \
year 1.000000e+00 -9.407075e-14 6.716271e-02 5.811650e-02
month -9.407075e-14 1.000000e+00 1.478750e-01 1.398144e-01
volume 6.716271e-02 1.478750e-01 1.000000e+00 9.938410e-01
LD 5.811650e-02 1.398144e-01 9.938410e-01 1.000000e+00
MD 8.177545e-02 1.624172e-01 9.776146e-01 9.517810e-01
HD 7.588563e-02 1.464776e-01 9.731829e-01 9.479829e-01
Month_x 3.091662e-15 -5.189453e-16 1.448175e-15 1.361161e-15
Annual_GDP_USD 9.828700e-02 4.394880e-15 7.449182e-01 7.831202e-01
Monthly_GDP_USD 9.828700e-02 3.861047e-15 7.449182e-01 7.831202e-01
GDP_Growth_% -1.824413e-01 -8.968962e-15 -5.785296e-02 -7.272643e-02
Month_y -7.179846e-16 -1.548459e-15 -6.072144e-16 -5.221993e-16
Petrol_USD_per_Liter 5.002585e-01 1.037861e-15 3.774744e-01 3.471699e-01
Diesel_USD_per_Liter 5.130251e-01 1.479408e-14 3.603355e-01 3.309951e-01
Month -2.202359e-15 1.742913e-15 -1.345312e-16 -1.167000e-16
Freight_Tonnes -8.216716e-02 1.651400e-15 -5.667378e-02 -4.298098e-02
MD HD Month_x \
year 8.177545e-02 7.588563e-02 3.091662e-15
month 1.624172e-01 1.464776e-01 -5.189453e-16
volume 9.776146e-01 9.731829e-01 1.448175e-15
LD 9.517810e-01 9.479829e-01 1.361161e-15
MD 1.000000e+00 9.656071e-01 1.523280e-15
HD 9.656071e-01 1.000000e+00 1.453158e-15
Month_x 1.523280e-15 1.453158e-15 1.000000e+00
Annual_GDP_USD 6.310239e-01 6.964473e-01 4.777533e-16
Monthly_GDP_USD 6.310239e-01 6.964473e-01 5.148199e-16
GDP_Growth_% -2.940483e-02 -3.600845e-02 -3.015675e-15
Month_y -7.365450e-16 -6.967437e-16 -9.926840e-18
Petrol_USD_per_Liter 4.168259e-01 4.074879e-01 -1.484332e-15
Diesel_USD_per_Liter 3.987040e-01 3.894411e-01 -1.395536e-15
Month -1.756672e-16 -1.352592e-16 -4.923977e-17
Freight_Tonnes -7.736052e-02 -7.543811e-02 -3.696758e-16
Annual_GDP_USD Monthly_GDP_USD GDP_Growth_% \
year 9.828700e-02 9.828700e-02 -1.824413e-01
month 4.394880e-15 3.861047e-15 -8.968962e-15
volume 7.449182e-01 7.449182e-01 -5.785296e-02
LD 7.831202e-01 7.831202e-01 -7.272643e-02
MD 6.310239e-01 6.310239e-01 -2.940483e-02
HD 6.964473e-01 6.964473e-01 -3.600845e-02
Month_x 4.777533e-16 5.148199e-16 -3.015675e-15
Annual_GDP_USD 1.000000e+00 1.000000e+00 -1.080069e-01
Monthly_GDP_USD 1.000000e+00 1.000000e+00 -1.080069e-01
GDP_Growth_% -1.080069e-01 -1.080069e-01 1.000000e+00
Month_y -1.220778e-15 -1.220759e-15 -3.439145e-15
Petrol_USD_per_Liter 3.165305e-01 3.165305e-01 -1.215996e-01
Diesel_USD_per_Liter 3.065903e-01 3.065903e-01 -1.261786e-01
Month -5.175397e-16 -5.173481e-16 1.748617e-15
Freight_Tonnes 7.581042e-03 7.581042e-03 -9.103985e-02
Month_y Petrol_USD_per_Liter \
year -7.179846e-16 5.002585e-01
month -1.548459e-15 1.037861e-15
volume -6.072144e-16 3.774744e-01
LD -5.221993e-16 3.471699e-01
MD -7.365450e-16 4.168259e-01
HD -6.967437e-16 4.074879e-01
Month_x -9.926840e-18 -1.484332e-15
Annual_GDP_USD -1.220778e-15 3.165305e-01
Monthly_GDP_USD -1.220759e-15 3.165305e-01
GDP_Growth_% -3.439145e-15 -1.215996e-01
Month_y 1.000000e+00 3.614679e-03
Petrol_USD_per_Liter 3.614679e-03 1.000000e+00
Diesel_USD_per_Liter 3.829066e-03 9.993229e-01
Month -6.309977e-19 -1.646487e-15
Freight_Tonnes -5.260776e-16 -9.796710e-02
Diesel_USD_per_Liter Month Freight_Tonnes
year 5.130251e-01 -2.202359e-15 -8.216716e-02
month 1.479408e-14 1.742913e-15 1.651400e-15
volume 3.603355e-01 -1.345312e-16 -5.667378e-02
LD 3.309951e-01 -1.167000e-16 -4.298098e-02
MD 3.987040e-01 -1.756672e-16 -7.736052e-02
HD 3.894411e-01 -1.352592e-16 -7.543811e-02
Month_x -1.395536e-15 -4.923977e-17 -3.696758e-16
Annual_GDP_USD 3.065903e-01 -5.175397e-16 7.581042e-03
Monthly_GDP_USD 3.065903e-01 -5.173481e-16 7.581042e-03
GDP_Growth_% -1.261786e-01 1.748617e-15 -9.103985e-02
Month_y 3.829066e-03 -6.309977e-19 -5.260776e-16
Petrol_USD_per_Liter 9.993229e-01 -1.646487e-15 -9.796710e-02
Diesel_USD_per_Liter 1.000000e+00 -1.686668e-15 -9.524807e-02
Month -1.686668e-15 1.000000e+00 1.752392e-15
Freight_Tonnes -9.524807e-02 1.752392e-15 1.000000e+00
plt.figure()
plt.imshow(corr_matrix)
plt.colorbar()
plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns, rotation=90)
plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns)
plt.title("Correlation Heatmap - CV & Macro Variables")
plt.show()
df.columns = df.columns.str.strip().str.lower()
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: Inspect columns
print("Columns in CSV:", df.columns)
print(df.head())
# Step 4: Prepare features and target
# Target is 'volume'
target = 'volume'
# Features: year, month, Diesel, Petrol, Monthly GDP
features = ['year', 'month']
if 'Diesel_USD_per_Liter' in df.columns:
features.append('Diesel_USD_per_Liter')
if 'Petrol_USD_per_Liter' in df.columns:
features.append('Petrol_USD_per_Liter')
if 'Monthly_GDP_USD' in df.columns:
features.append('Monthly_GDP_USD')
X = df[features]
y = df[target]
# Step 5: Split into train and test (last 20% as test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# Step 6: Train Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
# Step 7: Predict on test set
y_pred = model.predict(X_test)
# Step 8: Evaluate model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R2 Score: {r2:.2f}")
# Step 9: Forecast next 12 months (simple growth assumptions)
last_year = df['year'].max()
future_months = pd.DataFrame({
'year': [last_year + 1]*12,
'month': list(range(1,13))
})
# Add macro features with growth
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
if col in df.columns:
last_values = df[col].iloc[-12:].values
# Simple growth assumption: 2% for Diesel/Petrol, 3% for GDP
growth = 1.03 if 'GDP' in col else 1.02
future_months[col] = last_values * growth
future_forecast = model.predict(future_months)
# Step 10: Plot actual, predicted, and forecast
plt.figure(figsize=(14,6))
plt.plot(df['volume'], label='Actual Volume')
plt.plot(range(len(X_train), len(X_train)+len(y_pred)), y_pred, label='Predicted Volume', linestyle='--')
plt.plot(range(len(df), len(df)+len(future_forecast)), future_forecast, label=f'Forecast {last_year + 1}', linestyle=':')
plt.xlabel('Time (Months)')
plt.ylabel('CV Volume')
plt.title('Monthly CV Volume Forecasting with Linear Regression')
plt.legend()
plt.show()
# Step 11: Combine forecast into DataFrame
forecast_df = future_months.copy()
forecast_df['Forecasted_Volume'] = future_forecast
print(forecast_df)
Columns in CSV: Index(['country', 'year', 'month', 'volume', 'ds', 'LD', 'MD', 'HD', 'Month_x',
'Annual_GDP_USD', 'Monthly_GDP_USD', 'GDP_Growth_%', 'Month_y',
'Year-Month', 'Petrol_USD_per_Liter', 'Diesel_USD_per_Liter', 'Month',
'Freight_Tonnes'],
dtype='object')
country year month volume ds LD MD HD Month_x \
0 Argentina 2005 1 6723 2005-01-01 3899 1815 1008 1
1 Argentina 2005 1 6723 2005-01-01 3899 1815 1008 1
2 Argentina 2005 1 6723 2005-01-01 3899 1815 1008 1
3 Argentina 2005 1 6723 2005-01-01 3899 1815 1008 1
4 Argentina 2005 1 6723 2005-01-01 3899 1815 1008 1
Annual_GDP_USD Monthly_GDP_USD GDP_Growth_% Month_y Year-Month \
0 1.987371e+11 1.656142e+10 0.0 1 2005-01
1 1.987371e+11 1.656142e+10 0.0 1 2005-01
2 1.987371e+11 1.656142e+10 0.0 1 2005-01
3 1.987371e+11 1.656142e+10 0.0 1 2005-01
4 1.987371e+11 1.656142e+10 0.0 1 2005-01
Petrol_USD_per_Liter Diesel_USD_per_Liter Month Freight_Tonnes
0 0.784 0.76 1 6.42
1 0.784 0.76 2 6.42
2 0.784 0.76 3 6.42
3 0.784 0.76 4 6.42
4 0.784 0.76 5 6.42
MSE: 208184519.55, R2 Score: 0.46
year month Diesel_USD_per_Liter Petrol_USD_per_Liter Monthly_GDP_USD \
0 2024 1 0.8517 0.85986 4.485840e+10
1 2024 2 0.8517 0.85986 4.485840e+10
2 2024 3 0.8517 0.85986 4.485840e+10
3 2024 4 0.8517 0.85986 4.485840e+10
4 2024 5 0.8517 0.85986 4.485840e+10
5 2024 6 0.8517 0.85986 4.485840e+10
6 2024 7 0.8517 0.85986 4.485840e+10
7 2024 8 0.8517 0.85986 4.485840e+10
8 2024 9 0.8517 0.85986 4.485840e+10
9 2024 10 0.8517 0.85986 4.485840e+10
10 2024 11 0.8517 0.85986 4.485840e+10
11 2024 12 0.8517 0.85986 4.485840e+10
Forecasted_Volume
0 -1250.420299
1 -90.540184
2 1069.339930
3 2229.220045
4 3389.100159
5 4548.980274
6 5708.860388
7 6868.740503
8 8028.620617
9 9188.500732
10 10348.380846
11 11508.260961
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: Define target and features
target = 'volume'
macro_cols = []
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
if col in df.columns:
macro_cols.append(col)
features_base = ['year', 'month'] + macro_cols
# Step 4: Prepare forecast DataFrame
forecast_all = pd.DataFrame()
# Step 5: Loop over each country
for country in df['country'].unique():
df_country = df[df['country'] == country].copy()
X = df_country[features_base]
y = df_country[target]
# Train linear regression
model = LinearRegression()
model.fit(X, y)
# Forecast months for 2026-2030
last_year = df_country['year'].max()
years_forecast = list(range(last_year+1, 2031)) # 2026 to 2030
months_forecast = list(range(1,13))
future_dates = pd.DataFrame({
'year': np.repeat(years_forecast, 12),
'month': months_forecast * len(years_forecast)
})
# Add macro columns with growth assumptions
for col in macro_cols:
last_values = df_country[col].iloc[-12:].values
growth = 1.03 if 'GDP' in col else 1.02
future_values = []
for i in range(len(years_forecast)):
future_values.extend(last_values * (growth**(i+1))) # cumulative growth per year
future_dates[col] = future_values
# Predict monthly volumes
future_dates['Forecasted_Volume'] = model.predict(future_dates[features_base])
future_dates['country'] = country
# Aggregate to yearly volume
yearly_forecast = future_dates.groupby('year')['Forecasted_Volume'].sum().reset_index()
yearly_forecast['country'] = country
forecast_all = pd.concat([forecast_all, yearly_forecast], ignore_index=True)
# Step 6: Plot country-wise yearly forecast
plt.figure(figsize=(14,7))
for country in forecast_all['country'].unique():
df_plot = forecast_all[forecast_all['country'] == country]
plt.plot(df_plot['year'], df_plot['Forecasted_Volume'], marker='o', label=country)
plt.title('Country-wise Yearly CV Volume Forecast (2026–2030)')
plt.xlabel('Year')
plt.ylabel('Forecasted CV Volume')
plt.xticks(years_forecast)
plt.grid(True)
plt.legend()
plt.show()
# Step 7: Save forecast to CSV
forecast_all.to_csv('country_yearly_cv_forecast_2026_2030.csv', index=False)
print("Yearly forecast saved to 'country_yearly_cv_forecast_2026_2030.csv'")
Yearly forecast saved to 'country_yearly_cv_forecast_2026_2030.csv'
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: Define target and features
target = 'volume'
macro_cols = []
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
if col in df.columns:
macro_cols.append(col)
features_base = ['year', 'month'] + macro_cols
# Step 4: Prepare forecast DataFrame
forecast_all = pd.DataFrame()
# Step 5: Loop over each country for monthly forecast
for country in df['country'].unique():
df_country = df[df['country'] == country].copy()
X = df_country[features_base]
y = df_country[target]
# Train Linear Regression
model = LinearRegression()
model.fit(X, y)
# Forecast months for 2026–2030
last_year = df_country['year'].max()
years_forecast = list(range(last_year+1, 2031)) # 2026–2030
months_forecast = list(range(1,13))
future_dates = pd.DataFrame({
'year': np.repeat(years_forecast, 12),
'month': months_forecast * len(years_forecast)
})
# Add macro columns with growth assumptions
for col in macro_cols:
last_values = df_country[col].iloc[-12:].values
growth = 1.03 if 'GDP' in col else 1.02
future_values = []
for i in range(len(years_forecast)):
future_values.extend(last_values * (growth**(i+1)))
future_dates[col] = future_values
# Predict monthly volumes
future_dates['Forecasted_Volume'] = model.predict(future_dates[features_base])
future_dates['country'] = country
forecast_all = pd.concat([forecast_all, future_dates[['country','year','Forecasted_Volume']]], ignore_index=True)
# Step 6: Aggregate to yearly totals
yearly_forecast = forecast_all.groupby(['year','country'])['Forecasted_Volume'].sum().reset_index()
# Step 7: Pivot for plotting
plot_data = yearly_forecast.pivot(index='year', columns='country', values='Forecasted_Volume').fillna(0)
# Step 8: Plot grouped bar chart
plot_data.plot(kind='bar', figsize=(14,7))
plt.title('Country-wise Yearly CV Volume Forecast (2026–2030)')
plt.xlabel('Year')
plt.ylabel('Forecasted CV Volume')
plt.xticks(rotation=0)
plt.legend(title='Country')
plt.grid(axis='y')
plt.show()
# ==========================================================
# CLEAN WORKING SARIMAX FORECAST
# ==========================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings
warnings.filterwarnings("ignore")
# -----------------------------
# 1️⃣ SETTINGS
# -----------------------------
FILE_PATH = "monthly_cv_with_macro.csv"
COUNTRY = "India"
FORECAST_START = "2026-01-01"
FORECAST_END = "2030-12-01"
EXOG_COLS = [
"Diesel_USD_per_Liter",
"Petrol_USD_per_Liter",
"Monthly_GDP_USD"
]
# -----------------------------
# 2️⃣ LOAD DATA
# -----------------------------
df = pd.read_csv(FILE_PATH)
# Filter country
df = df[df["country"] == COUNTRY].copy()
# Convert to datetime
df["Year-Month"] = pd.to_datetime(df["Year-Month"])
# -----------------------------
# 3️⃣ REMOVE DUPLICATES SAFELY
# -----------------------------
df = (
df.groupby("Year-Month")
.agg({
"volume": "sum",
"Diesel_USD_per_Liter": "mean",
"Petrol_USD_per_Liter": "mean",
"Monthly_GDP_USD": "mean"
})
.reset_index()
)
# Set index
df = df.sort_values("Year-Month")
df.set_index("Year-Month", inplace=True)
# -----------------------------
# 4️⃣ ENSURE MONTHLY FREQUENCY
# -----------------------------
df = df.asfreq("MS")
# Fill missing months if any
df = df.interpolate()
# -----------------------------
# 5️⃣ DEFINE TARGET & EXOG
# -----------------------------
y = df["volume"]
exog = df[EXOG_COLS]
# -----------------------------
# 6️⃣ BUILD MODEL
# -----------------------------
model = SARIMAX(
y,
exog=exog,
order=(1,1,1),
seasonal_order=(1,1,1,12),
enforce_stationarity=False,
enforce_invertibility=False
)
model_fit = model.fit(disp=False)
print(model_fit.summary())
# -----------------------------
# 7️⃣ CREATE FUTURE DATES
# -----------------------------
future_index = pd.date_range(
start=FORECAST_START,
end=FORECAST_END,
freq="MS"
)
# -----------------------------
# 8️⃣ FUTURE EXOG ASSUMPTIONS
# -----------------------------
last_12 = df[EXOG_COLS].iloc[-12:].values
future_exog = []
years = len(future_index) // 12
for i in range(years):
for j in range(12):
diesel = last_12[j, 0] * (1.02 ** (i+1))
petrol = last_12[j, 1] * (1.02 ** (i+1))
gdp = last_12[j, 2] * (1.03 ** (i+1))
future_exog.append([diesel, petrol, gdp])
future_exog = pd.DataFrame(
future_exog,
columns=EXOG_COLS,
index=future_index
)
# -----------------------------
# 9️⃣ FORECAST
# -----------------------------
forecast = model_fit.get_forecast(
steps=len(future_index),
exog=future_exog
)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()
# -----------------------------
# 🔟 PLOT
# -----------------------------
plt.figure(figsize=(14,6))
plt.plot(y, label="Historical")
plt.plot(forecast_mean, label="Forecast 2026-2030")
plt.fill_between(
forecast_ci.index,
forecast_ci.iloc[:,0],
forecast_ci.iloc[:,1],
alpha=0.3
)
plt.title(f"Monthly CV Volume Forecast - {COUNTRY}")
plt.xlabel("Year-Month")
plt.ylabel("Volume")
plt.legend()
plt.grid(True)
plt.show()
# -----------------------------
# 1️⃣1️⃣ SAVE OUTPUT
# -----------------------------
forecast_df = pd.DataFrame({
"Year-Month": future_index,
"Forecasted_Volume": forecast_mean.values,
"Lower_CI": forecast_ci.iloc[:,0].values,
"Upper_CI": forecast_ci.iloc[:,1].values
})
forecast_df.to_csv(f"{COUNTRY}_forecast_2026_2030.csv", index=False)
print("Forecast saved successfully.")
print(forecast_df.head())
SARIMAX Results
==========================================================================================
Dep. Variable: volume No. Observations: 228
Model: SARIMAX(1, 1, 1)x(1, 1, 1, 12) Log Likelihood -3395.110
Date: Fri, 20 Feb 2026 AIC 6806.219
Time: 00:20:46 BIC 6832.646
Sample: 01-01-2005 HQIC 6816.913
- 12-01-2023
Covariance Type: opg
========================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------
Diesel_USD_per_Liter -5.232e+08 4.63e-08 -1.13e+16 0.000 -5.23e+08 -5.23e+08
Petrol_USD_per_Liter 5.089e+08 4.81e-08 1.06e+16 0.000 5.09e+08 5.09e+08
Monthly_GDP_USD 0.0008 2.2e-05 34.271 0.000 0.001 0.001
ar.L1 0.7242 0.690 1.049 0.294 -0.628 2.077
ma.L1 -0.7008 0.761 -0.921 0.357 -2.192 0.791
ar.S.L12 -0.1070 0.043 -2.460 0.014 -0.192 -0.022
ma.S.L12 -1.0293 0.035 -29.540 0.000 -1.098 -0.961
sigma2 4.244e+13 1.93e-14 2.2e+27 0.000 4.24e+13 4.24e+13
===================================================================================
Ljung-Box (L1) (Q): 0.08 Jarque-Bera (JB): 8497.94
Prob(Q): 0.77 Prob(JB): 0.00
Heteroskedasticity (H): 5.28 Skew: -1.60
Prob(H) (two-sided): 0.00 Kurtosis: 34.69
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 3.09e+43. Standard errors may be unstable.
Forecast saved successfully. Year-Month Forecasted_Volume Lower_CI Upper_CI 0 2026-01-01 1.447715e+08 1.320028e+08 1.575401e+08 1 2026-02-01 1.414410e+08 1.231709e+08 1.597111e+08 2 2026-03-01 1.395686e+08 1.169798e+08 1.621574e+08 3 2026-04-01 1.382867e+08 1.120017e+08 1.645716e+08 4 2026-05-01 1.372783e+08 1.077046e+08 1.668521e+08
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: Filter for India
df_india = df[df['country'] == 'India'].copy()
# Step 4: Convert 'Year-Month' to datetime
df_india['Year-Month'] = pd.to_datetime(df_india['Year-Month'])
# Step 5: Aggregate duplicate months (sum volume, average macro)
df_india = df_india.groupby('Year-Month').agg({
'volume': 'sum',
'Diesel_USD_per_Liter': 'mean',
'Petrol_USD_per_Liter': 'mean',
'Monthly_GDP_USD': 'mean'
}).reset_index()
# Step 6: Sort by date
df_india = df_india.sort_values('Year-Month')
# Step 7: Set index and frequency
df_india.set_index('Year-Month', inplace=True)
df_india.index.freq = 'MS' # monthly start
# Step 8: Handle missing values
df_india['volume'] = df_india['volume'].interpolate() # linear interpolation for volume
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
df_india[col] = df_india[col].ffill() # forward fill macro variables
# Step 9: Prepare target and exogenous variables
y = df_india['volume']
exog_cols = ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']
exog = df_india[exog_cols]
# Step 10: Define SARIMAX model
model = SARIMAX(
y,
exog=exog,
order=(1,1,0), # ARIMA(p,d,q)
seasonal_order=(1,0,0,12), # Seasonal (P,D,Q,s)
enforce_stationarity=False,
enforce_invertibility=False
)
# Step 11: Fit SARIMAX
model_fit = model.fit(disp=False)
print(model_fit.summary())
# Step 12: Forecast 2026–2030 (60 months)
future_index = pd.date_range(start='2026-01-01', end='2030-12-01', freq='MS')
# Generate future exogenous variables with growth assumptions
last_12_months = df_india[exog_cols].iloc[-12:].values
future_exog = []
for i in range(5): # 5 years
for j in range(12): # 12 months
diesel = last_12_months[j,0] * (1.02**(i+1)) # 2% annual growth
petrol = last_12_months[j,1] * (1.02**(i+1))
gdp = last_12_months[j,2] * (1.03**(i+1)) # 3% GDP growth
future_exog.append([diesel, petrol, gdp])
future_exog = pd.DataFrame(future_exog, columns=exog_cols, index=future_index)
# Step 13: Get forecast
forecast = model_fit.get_forecast(steps=len(future_index), exog=future_exog)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()
# Step 14: Plot historical + forecast
plt.figure(figsize=(14,6))
plt.plot(y.index, y.values, label='Historical Volume', color='blue')
plt.plot(forecast_mean.index, forecast_mean.values, label='Forecast 2026-2030', color='red')
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:,0], forecast_ci.iloc[:,1], color='pink', alpha=0.3, label='Confidence Interval')
plt.title('Monthly CV Volume Forecast for India (SARIMAX)')
plt.xlabel('Year-Month')
plt.ylabel('CV Volume')
plt.legend()
plt.grid(True)
plt.show()
# Step 15: Save forecast to CSV
forecast_df = pd.DataFrame({
'Year-Month': future_index,
'Forecasted_Volume': forecast_mean.values
})
forecast_df.to_csv('India_monthly_cv_forecast_2026_2030.csv', index=False)
print("Forecast saved to 'India_monthly_cv_forecast_2026_2030.csv'")
# Step 16: Optional: print forecast
print(forecast_df)
SARIMAX Results
==========================================================================================
Dep. Variable: volume No. Observations: 228
Model: SARIMAX(1, 1, 0)x(1, 0, 0, 12) Log Likelihood -3598.965
Date: Fri, 20 Feb 2026 AIC 7209.931
Time: 00:21:13 BIC 7230.126
Sample: 01-01-2005 HQIC 7218.092
- 12-01-2023
Covariance Type: opg
========================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------
Diesel_USD_per_Liter -3.746e+08 1.41e-07 -2.65e+15 0.000 -3.75e+08 -3.75e+08
Petrol_USD_per_Liter 3.608e+08 1.46e-07 2.46e+15 0.000 3.61e+08 3.61e+08
Monthly_GDP_USD 0.0007 3.29e-05 20.217 0.000 0.001 0.001
ar.L1 -0.0032 1.724 -0.002 0.999 -3.382 3.376
ar.S.L12 0.1255 0.021 5.992 0.000 0.084 0.166
sigma2 2.313e+13 1.64e-12 1.41e+25 0.000 2.31e+13 2.31e+13
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 9396.76
Prob(Q): 0.95 Prob(JB): 0.00
Heteroskedasticity (H): 9.20 Skew: -2.77
Prob(H) (two-sided): 0.00 Kurtosis: 34.99
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 7.02e+39. Standard errors may be unstable.
Forecast saved to 'India_monthly_cv_forecast_2026_2030.csv' Year-Month Forecasted_Volume 0 2026-01-01 1.456731e+08 1 2026-02-01 1.458136e+08 2 2026-03-01 1.458297e+08 3 2026-04-01 1.458933e+08 4 2026-05-01 1.460044e+08 5 2026-06-01 1.456674e+08 6 2026-07-01 1.455193e+08 7 2026-08-01 1.456181e+08 8 2026-09-01 1.456551e+08 9 2026-10-01 1.458069e+08 10 2026-11-01 1.456446e+08 11 2026-12-01 1.456921e+08 12 2027-01-01 1.517364e+08 13 2027-02-01 1.518974e+08 14 2027-03-01 1.519159e+08 15 2027-04-01 1.519887e+08 16 2027-05-01 1.521159e+08 17 2027-06-01 1.517299e+08 18 2027-07-01 1.515603e+08 19 2027-08-01 1.516734e+08 20 2027-09-01 1.517158e+08 21 2027-10-01 1.518897e+08 22 2027-11-01 1.517038e+08 23 2027-12-01 1.517582e+08 24 2028-01-01 1.581240e+08 25 2028-02-01 1.582905e+08 26 2028-03-01 1.583095e+08 27 2028-04-01 1.583848e+08 28 2028-05-01 1.585163e+08 29 2028-06-01 1.581174e+08 30 2028-07-01 1.579420e+08 31 2028-08-01 1.580589e+08 32 2028-09-01 1.581027e+08 33 2028-10-01 1.582825e+08 34 2028-11-01 1.580904e+08 35 2028-12-01 1.581466e+08 36 2029-01-01 1.647213e+08 37 2029-02-01 1.648913e+08 38 2029-03-01 1.649108e+08 39 2029-04-01 1.649877e+08 40 2029-05-01 1.651221e+08 41 2029-06-01 1.647145e+08 42 2029-07-01 1.645353e+08 43 2029-08-01 1.646547e+08 44 2029-09-01 1.646995e+08 45 2029-10-01 1.648832e+08 46 2029-11-01 1.646869e+08 47 2029-12-01 1.647443e+08 48 2030-01-01 1.715188e+08 49 2030-02-01 1.716923e+08 50 2030-03-01 1.717122e+08 51 2030-04-01 1.717907e+08 52 2030-05-01 1.719277e+08 53 2030-06-01 1.715119e+08 54 2030-07-01 1.713291e+08 55 2030-08-01 1.714509e+08 56 2030-09-01 1.714966e+08 57 2030-10-01 1.716840e+08 58 2030-11-01 1.714837e+08 59 2030-12-01 1.715423e+08
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: List of countries
countries = df['country'].unique()
# Step 4: Prepare empty DataFrame for all forecasts
all_forecasts = pd.DataFrame()
# Step 5: Loop through each country
for country in countries:
print(f"Processing {country}...")
# Filter for country
df_country = df[df['country'] == country].copy()
# Convert Year-Month to datetime
df_country['Year-Month'] = pd.to_datetime(df_country['Year-Month'])
# Aggregate duplicates
df_country = df_country.groupby('Year-Month').agg({
'volume': 'sum',
'Diesel_USD_per_Liter': 'mean',
'Petrol_USD_per_Liter': 'mean',
'Monthly_GDP_USD': 'mean'
}).reset_index()
# Sort by date
df_country = df_country.sort_values('Year-Month')
# Set index and monthly frequency
df_country.set_index('Year-Month', inplace=True)
df_country.index.freq = 'MS'
# Handle missing values
df_country['volume'] = df_country['volume'].interpolate()
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
df_country[col] = df_country[col].ffill()
# Prepare target and exogenous variables
y = df_country['volume']
exog_cols = ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']
exog = df_country[exog_cols]
# Fit SARIMAX
model = SARIMAX(
y,
exog=exog,
order=(1,1,0),
seasonal_order=(1,0,0,12),
enforce_stationarity=False,
enforce_invertibility=False
)
model_fit = model.fit(disp=False)
# Forecast 2026–2030
future_index = pd.date_range(start='2026-01-01', end='2030-12-01', freq='MS')
# Create future exog variables
last_12_months = df_country[exog_cols].iloc[-12:].values
future_exog = []
for i in range(5): # 5 years
for j in range(12):
diesel = last_12_months[j,0] * (1.02**(i+1))
petrol = last_12_months[j,1] * (1.02**(i+1))
gdp = last_12_months[j,2] * (1.03**(i+1))
future_exog.append([diesel, petrol, gdp])
future_exog = pd.DataFrame(future_exog, columns=exog_cols, index=future_index)
# Generate forecast
forecast = model_fit.get_forecast(steps=len(future_index), exog=future_exog)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()
# Create forecast DataFrame
forecast_df = pd.DataFrame({
'country': country,
'Year-Month': future_index,
'Forecasted_Volume': forecast_mean.values
})
# Append to all_forecasts
all_forecasts = pd.concat([all_forecasts, forecast_df])
# Plot historical + forecast
plt.figure(figsize=(12,5))
plt.plot(y.index, y.values, label='Historical', color='blue')
plt.plot(forecast_mean.index, forecast_mean.values, label='Forecast 2026-2030', color='red')
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:,0], forecast_ci.iloc[:,1], color='pink', alpha=0.3)
plt.title(f'{country} Monthly CV Volume Forecast (SARIMAX)')
plt.xlabel('Year-Month')
plt.ylabel('CV Volume')
plt.legend()
plt.grid(True)
plt.show()
# Step 6: Save all forecasts to CSV
all_forecasts.to_csv('All_Countries_Monthly_CV_Forecast_2026_2030.csv', index=False)
print("All country forecasts saved to 'All_Countries_Monthly_CV_Forecast_2026_2030.csv'")
Processing Argentina...
Processing Brazil...
Processing India...
Processing Japan...
Processing Mexico...
Processing South Africa...
Processing Thailand...
Processing United Arab Emirates...
All country forecasts saved to 'All_Countries_Monthly_CV_Forecast_2026_2030.csv'
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
# Step 2: Load CSV
df = pd.read_csv('monthly_cv_with_macro.csv')
# Step 3: List of countries and segments
countries = df['country'].unique()
segments = ['LD', 'MD', 'HD']
# Step 4: Prepare empty DataFrame for all forecasts
all_forecasts = pd.DataFrame()
# Step 5: Loop through each country and segment
for country in countries:
print(f"Processing {country}...")
df_country = df[df['country'] == country].copy()
# Convert Year-Month to datetime
df_country['Year-Month'] = pd.to_datetime(df_country['Year-Month'])
# Loop through segments
for seg in segments:
# Aggregate duplicates for this segment
df_seg = df_country.groupby('Year-Month').agg({
seg: 'sum',
'Diesel_USD_per_Liter': 'mean',
'Petrol_USD_per_Liter': 'mean',
'Monthly_GDP_USD': 'mean'
}).reset_index()
# Sort and set index
df_seg = df_seg.sort_values('Year-Month')
df_seg.set_index('Year-Month', inplace=True)
df_seg.index.freq = 'MS'
# Handle missing values
df_seg[seg] = df_seg[seg].interpolate()
for col in ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']:
df_seg[col] = df_seg[col].ffill()
# Prepare target and exogenous
y = df_seg[seg]
exog_cols = ['Diesel_USD_per_Liter', 'Petrol_USD_per_Liter', 'Monthly_GDP_USD']
exog = df_seg[exog_cols]
# Fit SARIMAX
model = SARIMAX(
y,
exog=exog,
order=(1,1,0),
seasonal_order=(1,0,0,12),
enforce_stationarity=False,
enforce_invertibility=False
)
model_fit = model.fit(disp=False)
# Forecast 2026–2030
future_index = pd.date_range(start='2026-01-01', end='2030-12-01', freq='MS')
last_12_months = df_seg[exog_cols].iloc[-12:].values
future_exog = []
for i in range(5): # 5 years
for j in range(12): # 12 months
diesel = last_12_months[j,0] * (1.02**(i+1))
petrol = last_12_months[j,1] * (1.02**(i+1))
gdp = last_12_months[j,2] * (1.03**(i+1))
future_exog.append([diesel, petrol, gdp])
future_exog = pd.DataFrame(future_exog, columns=exog_cols, index=future_index)
forecast = model_fit.get_forecast(steps=len(future_index), exog=future_exog)
forecast_mean = forecast.predicted_mean
forecast_ci = forecast.conf_int()
# Prepare forecast DataFrame
forecast_df = pd.DataFrame({
'country': country,
'segment': seg,
'Year-Month': future_index,
'Forecasted_Volume': forecast_mean.values
})
# Append to all_forecasts
all_forecasts = pd.concat([all_forecasts, forecast_df])
# Plot for each segment
plt.figure(figsize=(12,5))
plt.plot(y.index, y.values, label='Historical', color='blue')
plt.plot(forecast_mean.index, forecast_mean.values, label='Forecast 2026-2030', color='red')
plt.fill_between(forecast_ci.index, forecast_ci.iloc[:,0], forecast_ci.iloc[:,1], color='pink', alpha=0.3)
plt.title(f'{country} {seg} Segment Monthly CV Forecast (SARIMAX)')
plt.xlabel('Year-Month')
plt.ylabel('CV Volume')
plt.legend()
plt.grid(True)
plt.show()
# Step 6: Save all forecasts to CSV
all_forecasts.to_csv('All_Countries_LD_MD_HD_CV_Forecast_2026_2030.csv', index=False)
print("Forecast for LD, MD, HD segments saved to 'All_Countries_LD_MD_HD_CV_Forecast_2026_2030.csv'")
Processing Argentina...
Processing Brazil...
Processing India...
Processing Japan...
Processing Mexico...
Processing South Africa...
Processing Thailand...
Processing United Arab Emirates...
Forecast for LD, MD, HD segments saved to 'All_Countries_LD_MD_HD_CV_Forecast_2026_2030.csv'
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
# Load dataset
df = pd.read_csv('monthly_cv_with_macro.csv')
# Macro growth assumptions
diesel_growth = 0.01
petrol_growth = 0.01
gdp_growth = 0.02
forecast_years = range(2026, 2031)
all_countries_forecast = []
for country in df['country'].unique():
df_country = df[df['country']==country].copy()
# Fill missing values
for col in ['LD','MD','HD','Diesel_USD_per_Liter','Petrol_USD_per_Liter','Monthly_GDP_USD']:
df_country[col] = df_country[col].interpolate().ffill()
df_country['Total'] = df_country['LD'] + df_country['MD'] + df_country['HD']
# Average of last 6 months for features
last_6 = df_country[['LD','MD','HD','Total','Diesel_USD_per_Liter','Petrol_USD_per_Liter','Monthly_GDP_USD']].iloc[-6:].mean()
# Yearly aggregated historical data for training
df_country['Year'] = pd.to_datetime(df_country['Year-Month']).dt.year
yearly_data = df_country.groupby('Year')[['LD','MD','HD','Total','Diesel_USD_per_Liter','Petrol_USD_per_Liter','Monthly_GDP_USD']].mean().reset_index()
X_train = yearly_data[['LD','MD','HD','Total','Diesel_USD_per_Liter','Petrol_USD_per_Liter','Monthly_GDP_USD']]
y_train = yearly_data[['LD','MD','HD']]
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
# Forecast future years
for y in forecast_years:
diesel = last_6['Diesel_USD_per_Liter'] * (1 + diesel_growth) ** (y - 2025)
petrol = last_6['Petrol_USD_per_Liter'] * (1 + petrol_growth) ** (y - 2025)
gdp = last_6['Monthly_GDP_USD'] * (1 + gdp_growth) ** (y - 2025)
# Use DataFrame with column names to avoid sklearn warning
X_pred = pd.DataFrame([[last_6['LD'], last_6['MD'], last_6['HD'], last_6['Total'], diesel, petrol, gdp]],
columns=X_train.columns)
pred = rf.predict(X_pred)[0]
total = sum(pred)
all_countries_forecast.append([country, y, int(round(pred[0])), int(round(pred[1])), int(round(pred[2])), int(round(total))])
# Create final DataFrame
forecast_df = pd.DataFrame(all_countries_forecast, columns=['Country','Year','LD','MD','HD','Total'])
print(forecast_df)
# Save to CSV
forecast_df.to_csv('RF_Countrywise_Yearly_Forecast_2026_2030.csv', index=False)
print("Forecast saved to 'RF_Countrywise_Yearly_Forecast_2026_2030.csv'")
Country Year LD MD HD Total 0 Argentina 2026 8085 3764 2091 13940 1 Argentina 2027 8085 3764 2091 13940 2 Argentina 2028 8085 3764 2091 13940 3 Argentina 2029 8085 3764 2091 13940 4 Argentina 2030 8085 3764 2091 13940 5 Brazil 2026 27947 13010 7228 48185 6 Brazil 2027 27947 13010 7228 48185 7 Brazil 2028 28309 13178 7321 48809 8 Brazil 2029 28824 13418 7455 49697 9 Brazil 2030 28824 13418 7455 49697 10 India 2026 42783 19447 15557 77787 11 India 2027 43446 19748 15799 78993 12 India 2028 43333 19697 15758 78788 13 India 2029 43333 19697 15758 78788 14 India 2030 43333 19697 15758 78788 15 Japan 2026 40782 12548 9411 62741 16 Japan 2027 40782 12548 9411 62741 17 Japan 2028 40782 12548 9411 62741 18 Japan 2029 40782 12548 9411 62741 19 Japan 2030 40782 12548 9411 62741 20 Mexico 2026 35956 16738 9299 61993 21 Mexico 2027 35807 16669 9260 61737 22 Mexico 2028 35967 16743 9302 62013 23 Mexico 2029 35967 16743 9302 62013 24 Mexico 2030 35967 16743 9302 62013 25 South Africa 2026 7478 4487 2991 14956 26 South Africa 2027 7478 4487 2991 14957 27 South Africa 2028 7478 4487 2991 14957 28 South Africa 2029 7478 4487 2991 14956 29 South Africa 2030 7470 4482 2988 14939 30 Thailand 2026 17760 7400 4440 29600 31 Thailand 2027 17760 7400 4440 29600 32 Thailand 2028 17702 7376 4425 29503 33 Thailand 2029 21943 9143 5486 36572 34 Thailand 2030 21799 9083 5450 36331 35 United Arab Emirates 2026 1225 817 681 2723 36 United Arab Emirates 2027 1221 814 678 2712 37 United Arab Emirates 2028 1217 811 676 2704 38 United Arab Emirates 2029 1217 811 676 2704 39 United Arab Emirates 2030 1217 811 676 2704 Forecast saved to 'RF_Countrywise_Yearly_Forecast_2026_2030.csv'
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load forecast CSV
forecast_df = pd.read_csv('RF_Countrywise_Yearly_Forecast_2026_2030.csv')
# Ensure numeric
for col in ['LD','MD','HD','Total']:
forecast_df[col] = pd.to_numeric(forecast_df[col])
# Melt dataframe for seaborn plotting (long format)
df_melt = forecast_df.melt(id_vars=['Country','Year'],
value_vars=['LD','MD','HD','Total'],
var_name='Segment', value_name='Volume')
# Set style
sns.set(style="whitegrid")
# Create FacetGrid: one subplot per country
g = sns.FacetGrid(df_melt, col="Country", col_wrap=3, height=4, sharey=False)
g.map_dataframe(sns.barplot, x='Year', y='Volume', hue='Segment', palette='Set2', ci=None)
g.add_legend()
g.set_titles("{col_name}")
g.set_axis_labels("Year", "Volume")
# Rotate x-axis labels for readability
for ax in g.axes.flat:
for label in ax.get_xticklabels():
label.set_rotation(45)
plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# ==========================================================
# UNIVERSAL FORECAST EVALUATION FUNCTION
# ==========================================================
def evaluate_model(y_true, y_pred, model_name="Model", n_features=None):
y_true = pd.Series(y_true).reset_index(drop=True)
y_pred = pd.Series(y_pred).reset_index(drop=True)
mask = ~(y_true.isna() | y_pred.isna())
y_true = y_true[mask]
y_pred = y_pred[mask]
n = len(y_true)
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
bias = np.mean(y_pred - y_true)
non_zero = y_true != 0
mape = np.mean(np.abs((y_true[non_zero] - y_pred[non_zero]) / y_true[non_zero])) * 100
smape = np.mean(
2 * np.abs(y_pred - y_true) /
(np.abs(y_true) + np.abs(y_pred))
) * 100
r2 = r2_score(y_true, y_pred)
if n_features is not None and n > n_features + 1:
adj_r2 = 1 - ((1 - r2) * (n - 1) / (n - n_features - 1))
else:
adj_r2 = np.nan
return {
"Model": model_name,
"MAE": mae,
"RMSE": rmse,
"MSE": mse,
"MAPE(%)": mape,
"SMAPE(%)": smape,
"R2": r2,
"Adj_R2": adj_r2,
"Bias": bias
}
# ==========================================================
# EXAMPLE USAGE
# ==========================================================
y_true = [100, 120, 130, 150, 170]
y_pred1 = [110, 115, 128, 160, 165]
y_pred2 = [108, 118, 132, 155, 168]
model1 = evaluate_model(y_true, y_pred1, "SARIMAX", 2)
model2 = evaluate_model(y_true, y_pred2, "LinearReg", 2)
final_results = pd.DataFrame([model1, model2])
final_results = final_results.sort_values(by="RMSE")
# ==========================================================
# PRINT IN SINGLE LINE FORMAT (COMPACT & ALIGNED)
# ==========================================================
print("\n📊 MODEL PERFORMANCE SUMMARY\n")
print(f"{'Model':<12} {'MAE':>8} {'RMSE':>8} {'MSE':>10} "
f"{'MAPE%':>8} {'SMAPE%':>8} {'R2':>8} {'AdjR2':>8} {'Bias':>8}")
print("-" * 85)
for _, row in final_results.iterrows():
print(f"{row['Model']:<12} "
f"{row['MAE']:>8.2f} "
f"{row['RMSE']:>8.2f} "
f"{row['MSE']:>10.2f} "
f"{row['MAPE(%)']:>8.2f} "
f"{row['SMAPE(%)']:>8.2f} "
f"{row['R2']:>8.4f} "
f"{row['Adj_R2']:>8.4f} "
f"{row['Bias']:>8.2f}")
📊 MODEL PERFORMANCE SUMMARY Model MAE RMSE MSE MAPE% SMAPE% R2 AdjR2 Bias ------------------------------------------------------------------------------------- LinearReg 3.80 4.49 20.20 3.14 3.07 0.9654 0.9308 2.20 SARIMAX 6.40 7.13 50.80 5.06 4.95 0.9130 0.8260 1.60